{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Notebook Used to Generate Benchmark Results "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import polars as pl\n",
    "import polars_ds as pds"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Parallel ML metrics evaluations on segments \n",
    "\n",
    "Use cases:\n",
    "\n",
    "1. Evaluate ML model performance in market A, B, C.\n",
    "2. The Dataframe contains a column that defines the \"split\" of the dataframe. Then this can simulatneously evaluate ML model's performances on each of the train, test, recent, or any other split you have.\n",
    "3. Evaluate ML model performance over time, e.g. weekly / monthly \n",
    "\n",
    "Comparison: \n",
    "\n",
    "Polars + PDS vs Pandas + Sklearn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Generate a \n",
    "from datetime import date\n",
    "\n",
    "dates = pl.date_range(date(2020, 1, 1), date(2024, 10, 1), \"1d\", eager=True)\n",
    "df = pds.frame(size=len(dates)).select(\n",
    "    pds.random().alias(\"predicted\"),\n",
    "    (pds.random() > 0.25).cast(pl.UInt8).alias(\"actual_target\"),\n",
    "    dates = dates,\n",
    ")\n",
    "df_pd = df.to_pandas()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>predicted</th>\n",
       "      <th>actual_target</th>\n",
       "      <th>dates</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.413767</td>\n",
       "      <td>1</td>\n",
       "      <td>2020-01-01</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.125783</td>\n",
       "      <td>1</td>\n",
       "      <td>2020-01-02</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.382943</td>\n",
       "      <td>1</td>\n",
       "      <td>2020-01-03</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.690455</td>\n",
       "      <td>0</td>\n",
       "      <td>2020-01-04</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.492488</td>\n",
       "      <td>0</td>\n",
       "      <td>2020-01-05</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1731</th>\n",
       "      <td>0.365318</td>\n",
       "      <td>1</td>\n",
       "      <td>2024-09-27</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1732</th>\n",
       "      <td>0.635105</td>\n",
       "      <td>1</td>\n",
       "      <td>2024-09-28</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1733</th>\n",
       "      <td>0.156054</td>\n",
       "      <td>1</td>\n",
       "      <td>2024-09-29</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1734</th>\n",
       "      <td>0.736704</td>\n",
       "      <td>1</td>\n",
       "      <td>2024-09-30</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1735</th>\n",
       "      <td>0.660525</td>\n",
       "      <td>1</td>\n",
       "      <td>2024-10-01</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1736 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      predicted  actual_target      dates\n",
       "0      0.413767              1 2020-01-01\n",
       "1      0.125783              1 2020-01-02\n",
       "2      0.382943              1 2020-01-03\n",
       "3      0.690455              0 2020-01-04\n",
       "4      0.492488              0 2020-01-05\n",
       "...         ...            ...        ...\n",
       "1731   0.365318              1 2024-09-27\n",
       "1732   0.635105              1 2024-09-28\n",
       "1733   0.156054              1 2024-09-29\n",
       "1734   0.736704              1 2024-09-30\n",
       "1735   0.660525              1 2024-10-01\n",
       "\n",
       "[1736 rows x 3 columns]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from sklearn.metrics import roc_auc_score, log_loss, brier_score_loss"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "10.3 ms ± 83 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit\n",
    "df_pd[\"year\"] = df['dates'].dt.year()\n",
    "df_pd.groupby([\"year\"]).apply(\n",
    "    lambda df_group: pd.Series({\n",
    "        \"count\": len(df_group[\"actual_target\"]),\n",
    "        \"roc_auc\": roc_auc_score(df_group[\"actual_target\"], df_group[\"predicted\"]),\n",
    "        \"log_loss\": roc_auc_score(df_group[\"actual_target\"], df_group[\"predicted\"])\n",
    "    })\n",
    "    , include_groups=False\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2.13 ms ± 67.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit\n",
    "df.group_by(pl.col(\"dates\").dt.year()).agg(\n",
    "    count = pl.len(),\n",
    "    roc_auc = pds.query_roc_auc(\"actual_target\", \"predicted\"),\n",
    "    log_loss = pds.query_log_loss(\"actual_target\", \"predicted\")\n",
    ").sort(\"dates\")\n",
    "# 1/5 of the time, less lines of code + easier to understand syntax"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
